% updated 12/20/2016

% model 1: Y = lam1*A1*Y + lam2*A2*Y + rho*Y_ttl + X*b + error (with firm fixed effects)
% model 2: Y = lam1*A1*Y + lam2*A2*Y + rho*Y_ttl + X*b + error (with time fixed effects)
% model 3: Y = lam1*A1*Y + lam2*A2*Y + rho*Y_ttl + X*b + error (with firm and time fixed effects)
%% CONTROL PANEL
%clear
parentpath = cd(cd('..'));

warning('ON','msg:id')
dataset = 'cati_merged_sdc_rnd'; % Choose from: {'cati','sdc_rnd','sdc_rnd_ctt','cati_merged_sdc_rnd','cati_merged_sdc_rnd_ctt'}

lnkyear = 5; % Choose from: {3,4,5,6,7}
indstry = 100; % Choose from: {1,10,100,1000}
sicdgit = 1; % Choose from: {1,10,100,1000}

timelag = 1; % time lag of RND stock

%siclink = 1; %if use the within-sector and cross-sector RND matrices
%siclink = 2; %if A2 is the jaffe proximity matrix
%siclink = 3; %if A2 is the mahalanobis proximity matrix

fac = 1e3; % scale factor for output

yr1 = 1966;
yr2 = 2006;

% drop the first t0 years (i.e. the starting year is (yr1+t0))
t0 = 1;
%%
switch dataset
    case 'cati'
        n = 11093;
    case 'sdc_rnd'
        n = 12253;
    case 'sdc_rnd_ctt'
        n = 21387;
    case 'cati_merged_sdc_rnd'
        n = 19654;
    case 'cati_merged_sdc_rnd_ctt'
        n = 27147;
    otherwise
        disp('No appropriate dataset specified.')
end
T = yr2-yr1+1;
%%
ID = zeros(n,2);
ID(:,1) = (1:n)';
for s = 1:T
    yr = yr1+s-1;
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr) '_ID SIC SALES PROFIT RND EMPL FIXED_CAPITAL TOT_COST.csv'];
    flid = fopen(file);
    dat0 = textscan(flid,'%s %s %s %s %s %s %s %s','delimiter',';','headerlines',1);
    fclose(flid);
    %---------------------------------   
    FID = str2double(strtrim(dat0{1})); % FID
    SIC = str2double(strtrim(dat0{2})); % SIC
    FID(isnan(FID)) = 0;
    SIC(isnan(SIC)) = 0;
    SIC(floor(SIC/1000)==0) = 0; % drop firms with less than 4-digit SIC
    
    for i = 1:n
        if SIC(FID==i) > 0
            if SIC(FID==i) ~= ID(i,2)
                if ID(i,2) == 0
                    ID(i,2) = SIC(FID==i);
                else
                    ID(i,2) = -1;
                    %warning('msg:id','SIC code changes over time')
                end
            end
        end
    end
end
ID(:,2) = floor(ID(:,2)/sicdgit);
ID = ID(ID(:,2)>0,:); % drop firms with missing SIC
n = size(ID,1);

file = [parentpath '/' dataset '/location/' 'ID NATION_ID NATION CITY LOC_X LOC_Y.csv'];
flid = fopen(file);
dat0 = textscan(flid,'%s %s %s %s %s %s','delimiter',';','headerlines',1);
fclose(flid);
tmp1 = str2double(strtrim(dat0{1}));
tmp2 = str2double(strtrim(dat0{2}));

loc = zeros(size(ID,1),1);
for i = 1:n
    if sum(tmp1==ID(i,1)) == 1
        loc(i) = tmp2(tmp1==ID(i,1));
    end
end
ID = ID(loc==235,:);
n = size(ID,1);
%%
dm = cell(T,1);
X1 = cell(T,1);
X2 = cell(T,1);
X3 = cell(T,1);
X4 = cell(T,1);
for s = 1:T
    yr = yr1+s-1;
    %---------------------------------
    firm = zeros(n,2);
    %---------------------------------
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr) '_ID OUTPUT.csv'];
    flid = fopen(file);
    dat1 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    fid1 = str2double(strtrim(dat1{1}));
    outp = str2double(strtrim(dat1{2}));
    %---------------------------------
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr-timelag) '_ID RND_STOCK.csv'];
    flid = fopen(file);
    dat2 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    fid2 = str2double(strtrim(dat2{1}));
    stck = str2double(strtrim(dat2{2}));
    %---------------------------------
    for i = 1:n
        if (sum(fid1==ID(i,1))==1)&&(sum(fid2==ID(i,1))==1)
            firm(i,1) = outp(fid1==ID(i,1))/fac;
            firm(i,2) = stck(fid2==ID(i,1));
        end
    end
    firm(isnan(firm)) = 0;
    %---------------------------------
    file = [parentpath '/' dataset '/tot_output_compustat/' int2str(yr) '_Compustat_US_deflated_TOT_OUTPUT.csv'];
    flid  = fopen(file);
    dta1 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    sec1 = [str2double(strtrim(dta1{1})),str2double(strtrim(dta1{2}))/fac];
    sec1(isnan(sec1)) = 0;
    sec1(:,1) = floor(sec1(:,1)/sicdgit);
    %---------------------------------
    file = [parentpath '/' dataset '/tot_rnd_stock/' int2str(yr-timelag) '_Compustat_US_TOT_RND_STOCK.csv'];
    flid = fopen(file);
    dta2 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    sec2 = [str2double(strtrim(dta2{1})),str2double(strtrim(dta2{2}))];
    sec2(isnan(sec2)) = 0;
    sec2(:,1) = floor(sec2(:,1)/sicdgit);
    %---------------------------------
    % add the sector-level data to the firm-level data
    aggr = zeros(n,2);
    for i = 1:n
        aggr(i,1) = sum(sec1(sec1(:,1)==ID(i,2),2))-firm(i,1);
        aggr(i,2) = sum(sec2(sec2(:,1)==ID(i,2),2))-firm(i,2);
    end
    %---------------------------------
    % load RND adjacency matrix
    file = [parentpath '/' dataset '/adjacency_matrix_' num2str(lnkyear) '_years/' int2str(yr) '_adjacency_matrix.mat'];
    load(file)
    A = A(ID(:,1),:);
    A = A(:,ID(:,1));
    
    if siclink == 1
        ID2 = floor(ID(:,2)/indstry);
        A1 = A.*bsxfun(@eq,ID2,ID2');
        A2 = A-A1;
    elseif siclink == 2
        file = [parentpath '/' dataset '/patents/' int2str(yr) '_jaffe_proximity.mat'];
        load(file)
        P = P(ID(:,1),:);
        P = P(:,ID(:,1));
        
        A1 = A;
        A2 = P;
    else
        file = [parentpath '/' dataset '/patents/' int2str(yr) '_mahalanobis_proximity.mat'];
        load(file)
        M = M(ID(:,1),:);
        M = M(:,ID(:,1));
        
        A1 = A;
        A2 = M;
    end
    %---------------------------------
    % indicate firms with missing observations
    dm{s} = ones(n,1);
    for i = 1:n
        if (min(firm(i,1:2))<=0)||(min(aggr(i,1:2))<0)
            dm{s}(i) = 0;
        end
    end
    %---------------------------------
    % define variables
    X1{s} = firm;
    X2{s} = A1*firm;
    X3{s} = A2*firm;
    X4{s} = aggr;
end
%%
% drop the first t0 year(s)
dt = cat(1,dm{:});
dt = dt(n*t0+1:end);
T1 = T-t0;
%---------------------------------
% drop firms that appear less than twice
di = (1:n*T1)';
dj = kron(ones(T1,1),(1:n)');
Dn = sparse(di,dj,dt);
dd = sum(Dn,1);
Dn = Dn(:,dd>1); % firms appearing more than once in the panel
np = size(Dn,2); % that number of firms

En = speye(n);
En = En(dd>1,:); % firms appearing more than once in the panel
ID = En*ID;
for s = 1:T
    dm{s} = En*dm{s};
    X1{s} = En*X1{s};
    X2{s} = En*X2{s};
    X3{s} = En*X3{s};
    X4{s} = En*X4{s};
end
%%
D1 = cell(T1,1);
D2 = cell(T1,1);
Yn = cell(T1,1);
Zn = cell(T1,1);
Qn = cell(T1,1);
pid = cell(T1,1);
tid = cell(T1,1);
for s = (t0+1):T
    Da = diag(dm{s});
    Da = Da(sum(Da,2)==1,:);
    Db = zeros(size(Da,1),T1);
    Db(:,s-t0) = 1;
    D1{s-t0} = Da;
    D2{s-t0} = Db;    
    
    Yn{s-t0} = Da*X1{s}(:,1);
    Zn{s-t0} = Da*[X2{s}(:,1),X3{s}(:,1),X4{s}(:,1),X1{s}(:,2)];
    Qn{s-t0} = Da*[X2{s}(:,2),X3{s}(:,2),X4{s}(:,2),X1{s}(:,2)];
    pid{s-t0} = Da*ID(:,1);
    tid{s-t0} = (s-t0)*ones(size(Da,1),1);
end

D1 = cat(1,D1{:});
D2 = cat(1,D2{:});
Yn = cat(1,Yn{:});
Zn = cat(1,Zn{:});
Qn = cat(1,Qn{:});

temp = [cat(1,pid{:}),cat(1,tid{:}),Yn,Zn,Qn];
name = cell(1,size(temp,2));
name{1} = 'pid';
name{2} = 'tid';
name{3} = 'y';
for j = 1:size(Zn,2)
    name{j+3} = ['z' num2str(j)];
end
for j = 1:size(Qn,2)
    name{j+3+size(Zn,2)} = ['q' num2str(j)];
end

filename = ['stata_data2\data_' num2str(siclink) '.csv'];

delete(filename);
csvwrite_with_headers(filename,temp,name);